
* ---------------------------------------------------------------
* Session setup
* ---------------------------------------------------------------
clear all
set matsize 4000
* new variables default to double; "permanently" keeps this setting for future sessions too
set type double, permanently

* Folder roots (fill in before running).
* $dat is used below for data files and $do for the helper do-files;
* $text and $texg are not referenced in this part of the script.
global dat ""
global do ""
global text ""
global texg ""


capture program drop max
program define max
	* Usage: max varname
	* Overwrites varname with its maximum taken over all rows that share the same id.
	* The original values are parked under varname_ while egen builds the replacement.
	args v
	rename `v' `v'_
	bysort id: egen `v' = max(`v'_)
	drop `v'_
end

capture program drop max2
program define max2
	* Usage: max2 groupvar varname
	* Same as -max-, but the grouping variable is passed as the first argument
	* instead of being fixed to id.
	args grp v
	rename `v' `v'_
	bysort `grp': egen `v' = max(`v'_)
	drop `v'_
end



* Load the main ZEMIS file and stack the 2016/2017 file underneath.
* stus* is renamed to status* only for the duration of the append
* (presumably the 2016/2017 file uses the status* spelling -- confirm) and renamed back afterwards.
use "$dat/ZEMIS/ZEMIS.dta", clear
rename arb_aktiv_flag arb_flag
rename stus* status*
append using "$dat/ZEMIS/ZEMIS_20162017.dta"
rename status* stus*
tostring pers_id, replace

* Side file holding only the key variables, to merge with comment file
preserve
	keep kt_kz auf_b_jm stich_jm pers_id pz1_b_akt_jm
	save "$dat/ZEMIS_all.dta", replace
restore


********************
* LABEL AND RENAME *
********************
quietly do "$do/label_zemis.do" // label variables

* identifiers
rename pers_id id
label variable bur_id "Business id"
label variable dos_id "File id"
label variable verf_id "Case id"

* employment variables
rename (arb_flag arb_dau  arb_vonjm arb_stell_cd taet_typ_cd bra_typ_cd) ///
       (emp      days_emp emp_start emp_type     industry1   industry2)

* person characteristics
rename (gebnati_cd   nati_cd ziv_cd geb_jmt) ///
       (country_born country civ    bdate)

* case timing and outcome
rename (pz1_b_akt_jm pz1_dau   pz1_e_akt_jm pz1_e_akt_cd pz2_b_akt_jm) ///
       (case_begin   case_days case_end     stat_end     case2_begin)

* arrival and length of stay
rename (auf_b_jm auf_dau) (arrive days)

* location
rename (gem_cd kt_kz evz_cd) (mun cant center)
label variable cant "Canton, assigned"
label variable cant_res "Canton, residence"


*year (stich_jm is divided by 100 and truncated, i.e. it is treated as YYYYMM)
generate year = int(stich_jm/100)
drop stich_jm

* 207012 in the year-month variables means no info
foreach v in arrive case_begin case2_begin case_end emp_start {
	replace `v' = . if `v' == 207012
}

*woman
replace sex_kz = "" if inlist(sex_kz, "!", "?") // 0.6 %
generate woman = (sex_kz == "F") if !missing(sex_kz)
drop sex_kz

*civil status
recode civ 9=. // unknown 4,5 %
generate married = inlist(civ, 2, 10) if !missing(civ)

*birthdate: 18500101 is treated as missing (0.5 %), then the YYYYMMDD number
*is converted to a Stata daily date via its string representation
replace bdate = . if bdate == 18500101
tostring bdate, replace
generate born = date(bdate, "YMD")
format born %dd_m_CY
drop bdate


*country
* One recode per variable:
*   997 -> 998       includes Stateless + Without nationality
*   254, 256 -> 248  new countries Kosovo and Montenegro recoded to Serbia
*   363 -> 350       South Sudan recoded to Sudan
foreach v in country country_born {
	recode `v' (997 = 998) (254 256 = 248) (363 = 350)
}

quietly do "$do/label_country.do"
label values country country_lab
label values country_born country_lab

*drop strange countries, some may be accurate, but others are typos or the country of citizenship has been replaced by country of birth (according to newspaper)
* (only the citizenship variable is blanked; the list is split because of the argument limit of inlist() for strings)
decode country, generate(country_name)
replace country = . if inlist(country_name, "Australia", "Austria", "Belgium", "Canada", "Denmark", "Finland","France", "Germany","Great Britain")
replace country = . if inlist(country_name, "Greece","Ireland","Italy", "Luxembourg","Netherlands", "New Zealand", "Norway", "Portugal", "Spain")
replace country = . if inlist(country_name, "Sweden" , "USA", "Liechtenstein", "Malta")
drop country_name


*************************
** Correct error in id **
*************************
* Individuals with different id who share the same verf_id appear to be the same persons (same country, gender and birth date), but they have applied multiple times, connected with fingerprints in database
* Give them one id number if they never appear in the same year (assume any twins would appear in data during the same year) so that count of unique individuals is correct
* Correcting the error makes number of unique id 870 091 --> 816 365 

* country codes -1, -2 and 999 are set to missing before country is compared within groups
replace country=. if inlist(country,-1,-2, 999)
* keep the untouched id (id_orig is the merge key for comments.dta further down) and work on a copy of verf_id
clonevar id_orig=id
clonevar verf_id2 = verf_id
* NOTE(review): group_id is not a built-in Stata command (presumably the community-contributed package) -- it must be installed
group_id verf_id2, match(id) // create a common group id (=verf_id2) for all id that are connected through a verf_id and include all observations for those id

* dup = number of additional rows with the same verf_id2 and year; twin = largest dup within the group,
* so twin==0 means the group never has two rows in the same year
duplicates tag verf_id2 year, g(dup) // 1418, max 2 per year
bys verf_id2: egen twin=max(dup) // potential twin

* create common id for all id (at most 3) that are connected through verf_id and also have the same gender + country + birth date
* rank = running count of distinct id values inside the group (1 for the first id, 2 for the second, ...)
bys verf_id2 (id): g rank = sum(id != id[_n-1]) 
* within-group standard deviation of each trait; 0 means the trait is identical on all rows of the group
foreach x in born country woman {
bys verf_id2: egen `x'verf_sd=sd(`x')
}
* rows are processed in (year id) order, so each row copies the id of the row before it and the
* first id of the group propagates to all rows -- only when all three traits agree and twin==0
bys verf_id2 (year id): replace id=id[_n-1] if _n>1 & bornverf_sd==0 & countryverf_sd==0 & womanverf_sd==0  & twin==0

* if only 2 (of 3) id have the same gender country birth date, create common ID for those
bys verf_id2: egen maxrank=max(rank) 

* pairwise standard deviations for groups with exactly 3 ids: the suffix names the two ranks compared
* (12, 13, 23); rows belonging to the third rank get missing because of the if-condition
foreach x in born country woman {
bys verf_id2: egen `x'verf_sd12=sd(`x') if (rank==1 |rank==2)  & maxrank==3
bys verf_id2: egen `x'verf_sd13=sd(`x') if (rank==1 |rank==3)  & maxrank==3
bys verf_id2: egen `x'verf_sd23=sd(`x') if (rank==3 |rank==2)  & maxrank==3
}

* adding the pairwise sd to the by-list keeps the matching pair in its own by-group,
* so id is only copied between rows of the two ids being compared
bys verf_id2 bornverf_sd12 (year): replace id=id[_n-1] if _n>1 & bornverf_sd12==0 & countryverf_sd12==0 & womanverf_sd12==0  & twin==0
bys verf_id2 bornverf_sd13 (year): replace id=id[_n-1] if _n>1 & bornverf_sd13==0 & countryverf_sd13==0 & womanverf_sd13==0  & twin==0
bys verf_id2 bornverf_sd23 (year): replace id=id[_n-1] if _n>1 & bornverf_sd23==0 & countryverf_sd23==0 & womanverf_sd23==0  & twin==0


* remove the helper variables (twin, verf_id2 and id_orig are kept)
drop bornverf* countryverf* womanverf*  maxrank rank dup*

* id_error = 1 on rows whose id was changed; -max- then spreads the flag to every row of the (new) id
g id_error=(id_orig!=id)
max id_error


******************************************************************
* add info about requests to change canton during first 3 months *
******************************************************************
*note these variables are only defined for those assigned within 3 months
* The merge key is id_orig, i.e. the person id as it was before the id correction.
merge m:1 id_orig using "$dat/comments.dta"
tabulate cantreq if _merge == 3
* rows that exist only in comments.dta are discarded
keep if inlist(_merge, 1, 3)
drop _merge

***********************************
* Fill in gaps for some variables *
***********************************
* yearN = running observation number per person (ordered by year), yearS = its maximum,
* fyear = year of the first observation, copied to all rows of the person
bysort id (year): generate yearN = _n
bysort id: egen yearS = max(yearN)
generate fyear = year if yearN == 1
max fyear

* id must be numeric to declare the panel
destring id, replace
tsset id yearN

* -2, -1 and 999 include Without specification + Unknown + State unknown
foreach v in country country_born ethn_cd reli_cd spr_cd {
	recode `v' (-2 -1 999 = .)
}

replace cant = "" if inlist(cant, "Without Spec", "Unknown")
replace cant_res = "" if inlist(cant_res, "Unknown", "Without Spec")

replace center = . if center < 0

*fill in gaps: carry the last known value forward in time within person
* (carryforward is a community-contributed command and must be installed)
local fillvars center ethn_cd reli_cd spr_cd born country woman country_born ///
	arrive case_begin case_end cantass cantreq nowunsch_info nocomcant_info ///
	french_info german_info civ married cant cant_res
foreach v of local fillvars {
	bysort id (yearN): carryforward `v', replace
}

*variables that should be constant over time: with the data in reverse time order,
*carrying forward fills earlier years from later ones
gsort id -yearN
foreach v in born country woman country_born ethn_cd reli_cd spr_cd {
	bysort id: carryforward `v', replace
}

* Make these variables constant within person: keep the yearly value as *_orig and
* rebuild the variable from the value in the person's last observation (yearN==yearS)
foreach v in ethn_cd reli_cd spr_cd country country_born {
	replace `v' = -1 if missing(`v') //without specification + Unknown
	rename `v' `v'_orig
	generate `v' = `v'_orig if yearN == yearS
	max `v'
	pwcorr `v' `v'_orig // 0.985-0.999
	capture label values `v' `v'
}

* variables that change if re-apply
* fm<var> = value at the first observation (year-month), f<var> = its year part,
* m<var> = copy of the year-month value; <var> itself is reduced to the year
foreach v in arrive case_begin {
	generate fm`v' = `v' if yearN == 1
	generate f`v' = int(fm`v'/100)
	max f`v'
	max fm`v'
	generate m`v' = `v'
	replace `v' = int(`v'/100)
}

* info on center is added for everyone 2009, even those arriving earlier
* -> fill backwards in time, then take the value at the first observation
gsort id -year
by id: carryforward center, replace
generate fcenter = center if yearN == 1
max fcenter



***********************
*   code covars       *
***********************

* variation in variables that should be constant
foreach v in born woman {
	bysort id: egen `v'_sd = sd(`v')
}

* age = calendar year minus birth year
generate byear = year(born)
generate age = year - byear

* born_swiss is 1 for every row of a person who is observed at age 0, missing otherwise
generate born_swiss = 1 if age == 0
max born_swiss

* married at the first observation
bysort id: egen fmarried = max(married == 1 & yearN == 1)


*continent
recode country (201/299=1) (301/399=2) (401/438=3) (501/599=4) (998=6), generate(continent)
* Middle east is carved out by country code (No code for palestine)
replace continent = 5 if inlist(country,502, 242, 359, 513, 512, 514, 517, 521, 523, 527, 519, 535, 541, 516, 239, 532)
* codes not covered by the recode rules keep their country value and are blanked here
replace continent = . if !inrange(continent, 1, 6)
capture label drop cont
label define cont 1 "East Europe" 2 Africa 3 SAmerica 4 Asia 5 "Middle East" 6 "Without/stateless"
label values continent cont

* age at first arrival and its square
generate age_arrive = farrive - byear
generate age_arrive2 = age_arrive*age_arrive



*************
* Decision **
*************

*get F or B/C
generate getF = (stus_flag == 40)
generate getBC = (stus_flag == 60)

generate preproc = inlist(stus_flag, 10, 15) // "pre-proccesed"
max preproc

* For each permit indicator: y<permit> = years relative to the first year the permit is
* observed (0 in the year one gets it, negative before), and the indicator itself becomes
* 1 on all rows of a person who ever has it
sort id year
foreach p in getF getBC {
	generate year_get = year if `p' == 1
	bysort id: egen y`p' = min(year_get)
	replace y`p' = year - y`p'
	max `p'
	drop year_get
}

*******************
**  Wait period ***
*******************
* case_days=days since case starts, continues to be counted when someone gets F, only stops if get B,  case_days are counted from 0 again if a new application
* days=days in switzerland, stops increasing after decision, but keep on increasing if only get F (0 if missing arrive)
* 5 year rule hardship based on time in Switzerland (not time case was ongoing)

* negative counters are treated as missing
replace days = . if days < 0
replace case_days = . if case_days < 0

*g error=(days!=case_days)
*sum error // 3 % of obs
*drop error

* days_0 = 1 for persons whose day counter equals 0 on all of their rows
generate days0 = (days == 0)
bysort id: egen days0_sd = sd(days0)
bysort id: egen days_0 = max(days == 0 & (days0_sd == 0 | days0_sd == .))
drop days0 days0_sd


*count days per year instead of cumulative
rename days days_orig
xtset id yearN
generate days = d.days_orig if yearN > 1
replace days = days_orig if yearN == 1

tabulate year if days < 0 // often related to stat_end=departure

* gap_day: a later observation whose yearly increment is not 365/366 days,
* occurring before the first permit (F, or B/C for those who never get F)
local offlen "yearN != 1 & (days < 365 | days > 366) & !missing(days)"
generate gap_day = (`offlen' & ygetF < 0 & getF == 1)
replace gap_day = 1 if `offlen' & ygetBC < 0 & getF == 0 & getBC == 1
max gap_day
summarize gap_day if yearN == 1 & (getF == 1 | getBC == 1) // 4.8 %

replace days = days_orig if days < 0 //starting new count
replace days = 365 if days > 366 & !missing(days)

**calculate total number of days, but note that it's only reliable for period before permit without gap_days
generate days_total = 0
bysort id (year): replace days_total = sum(days) if inrange(year, 1994, 2017)

* wait time for first permit: cumulative days in the year the permit is first observed
generate wait_perm = days_total if getF == 1 & ygetF == 0
replace wait_perm = days_total if getF == 0 & getBC == 1 & ygetBC == 0
max wait_perm
generate lnwait_perm = ln(wait_perm + 1)
 


******************************
*  Detect family or relatives*
******************************

* sometimes the same person has multiple dos_id
* new marks one row per (dos_id, id) pair, so the totals below count distinct pairs
bys dos_id id: g new= _n==1
bys id: egen dos_sum1=total(new) // max 3 files per person id
bys dos_id: egen dos_sum2=total(new) //max 27 pers per dos id

* NOTE(review): group_id is not a built-in Stata command -- it must be installed
clonevar dos_id2 = dos_id
group_id dos_id2, match(id) // connect all individuals that shared a dos_id

* create variable of who arrives first
* fmarrive (first arrival, year-month) becomes a string here; a missing value is stored as "."
tostring fmarrive, replace
g t_fmarrive=date(fmarrive, "YM")
replace t_fmarrive=mofd(t_fmarrive) // so that +1 for each month

bys dos_id2: egen temp_fam=min(t_fmarrive) // based on year + month
g arrive_first=(t_fmarrive==temp_fam) if !mi(farrive) //can still have multiple people arrive at the same time

drop temp* t_fmarrive

* create variable famsize and arrive_with_child
* famsize = number of distinct id that share dos_id2 and arrive in the same month
bys dos_id2 fmarrive (id year): g famsize_ = sum(id!=id[_n-1])
* fmarrive is a string at this point, so mi(fmarrive) is false for the "." that tostring
* writes for a missing arrival month; the numeric first-arrival year farrive is tested
* instead, so that persons with unknown arrival are not counted as one family
bys dos_id2 fmarrive: egen famsize=max(famsize_) if !mi(dos_id2) & !mi(farrive)
drop famsize_

* with_child = adult (older than 18 at arrival) whose same-month arrival group contains
* someone aged 18 or younger
bys dos_id2 fmarrive : egen min_age=min(age_arrive)
g with_child=(age_arrive>18  & min_age<=18 )  if !mi(age_arrive)
drop min_age*


************************
** assignment canton  **
************************

* tab cantreq arrive_first if yearN==1, col
* 70 % of individuals who don't arrive first request a wunschkanton in year 1, i.e. probably request to move to family member

* move_later = canton of residence differs from the assigned canton (defined only when both are known)
generate move_later = (cantass != cant_res) if !missing(cant_res) & !missing(cantass)
*sum move_later if getF==1 & yearN==5 // 5%
*sum move_later if getBC==1 & yearN==5 // 7%

egen cantnum = group(cantass)

* create variable indicating language canton-origin country match
* langreg: I / F / G by assigned canton
* NOTE(review): BE, FR and VS are coded as French here -- confirm this is intended for the bilingual cantons
generate langreg = "I" if cantass == "TI"
replace langreg = "F" if inlist(cantass, "BE", "FR", "VS","GE","NE","VD","JU")
replace langreg = "G" if inlist(cantass,"AG","AR","AI","BS","BL","GL","LU","NW","OW")
replace langreg = "G" if inlist(cantass,"SH","SZ","SO","SG","TG","UR","ZG","ZH", "GR")
generate french_cant = (langreg == "F") if !missing(langreg)

* french / french_col: indicator by country code; french is switched on wherever french_col is 1
generate french = inlist(country, 301,303, 308,309,310,311,315,317,321,322,323,327,330, 332,333,335,337,341,345,346,354,356,360) if !missing(country)
generate french_col = inlist(country,301,303,304,308,309,310,311,315,317,321,322,323,327,330,331,332,333,335,337,341,345,346,354,356,357,360,372) if !missing(country)
replace french = 1 if french_col == 1

* interaction: French-coded country x French-coded canton
generate french_int = french*french_cant


*******************
**  Outcomes    ***
*******************

* code indicator for change of  employer
* (-2 is not treated as an employer id; the last real id is carried forward before comparing with the previous row)
generate bur_id_temp = bur_id if bur_id != -2
summarize bur_id_temp
bysort id (year): carryforward bur_id_temp, replace
bysort id (year): generate new_emp = (bur_id_temp != bur_id_temp[_n-1])
drop bur_id_temp

*employment
rename days_emp days_emp_orig //cumulative number of days with current employer
recode days_emp_orig (-5 -3 -2 -1 = 0) //from year 2000 no values of -2, instead 0

replace days_emp_orig = 0 if days_emp_orig == .
capture destring id, replace
xtset id yearN

* yearly days = change in the cumulative counter, with the raw counter used whenever a new count starts
generate days_emp = d.days_emp_orig if yearN > 1
* first observation: take the raw counter, except in 1994 where it is only used if employed
* (don't know if refer to current year in 1994)
replace days_emp = days_emp_orig if yearN == 1 & (fyear != 1994 | emp == 1)
replace days_emp = days_emp_orig if new_emp == 1 // new employer
replace days_emp = days_emp_orig if year == int(emp_start/100) //new start date
replace days_emp = days_emp_orig if days_emp < 0 & !missing(days_emp) // 99 % = 0 (i.e unemployed)
replace days_emp = 365 if days_emp > 366 & !missing(days_emp) // 90 % before 2001
replace days_emp = 0 if missing(days_emp)

*sum year if days_emp>0 & emp==0 //  only before 1999
*sum year if days_emp==0 & emp==1 // 291 obs

* running total of employment days per person
generate days_emp_total = 0
bysort id (year): replace days_emp_total = sum(days_emp) if inrange(year, 1994, 2017)

* employed at least 90 days in the year
generate emp90 = (days_emp >= 90) if !missing(days_emp)




***************
***************
***************
* Sample restriction
* first arrival before 1994: these individuals are not observed from start, around 80 percent of them miss info on ethnicity
drop if farrive < 1994
* first observed in 1994 with unknown first arrival
drop if fyear == 1994 & missing(farrive)

* id back to string, shrink storage types, and write the result
tostring id, replace
compress

save "$dat/ZEMIS_start.dta", replace





